library(mosaic)
library(tidyverse)
library(lubridate)
library(DataComputing)
library(rvest)
library(broom)
As COVID-19 spreads at an alarming rate, a pressing question emerges on a global scale: what characteristics of a country contribute to the spread of the coronavirus? The factors we will analyze are population density and proximity to the point of origin (China).
Data Source 1: COVID
# Load the raw COVID-19 table from the working directory, print it, and
# report how many rows were read.
COVID <- read.csv("total-covid-cases-deaths-per-million.csv")
COVID
nrow(COVID)
[1] 9487
# List the column names of the raw COVID table.
names(COVID)
[1] "total.covid.cases.deaths.per.million" "X" "X.1"
[4] "X.2" "X.3" "X.4"
[7] "X.5" "X.6" "X.7"
[10] "X.8" "X.9" "X.10"
[13] "X.11" "X.12" "X.13"
[16] "X.14" "X.15" "X.16"
[19] "X.17" "X.18" "X.19"
[22] "X.20" "X.21" "X.22"
[25] "X.23" "X.24" "X.25"
[28] "X.26" "X.27" "X.28"
[31] "X.29" "X.30" "X.31"
[34] "X.32" "X.33" "X.34"
[37] "X.35" "X.36" "X.37"
[40] "X.38" "X.39" "X.40"
[43] "X.41" "X.42" "X.43"
[46] "X.44" "X.45" "X.46"
[49] "X.47" "X.48" "X.49"
[52] "X.50" "X.51" "X.52"
[55] "X.53" "X.54" "X.55"
[58] "X.56" "X.57" "X.58"
[61] "X.59" "X.60" "X.61"
[64] "X.62" "X.63" "X.64"
[67] "X.65" "X.66" "X.67"
[70] "X.68" "X.69" "X.70"
[73] "X.71" "X.72" "X.73"
[76] "X.74" "X.75" "X.76"
[79] "X.77" "X.78" "X.79"
[82] "X.80" "X.81" "X.82"
[85] "X.83" "X.84" "X.85"
[88] "X.86" "X.87" "X.88"
[91] "X.89" "X.90" "X.91"
[94] "X.92" "X.93" "X.94"
[97] "X.95" "X.96" "X.97"
[100] "X.98" "X.99" "X.100"
[103] "X.101" "X.102" "X.103"
[106] "X.104" "X.105" "X.106"
[109] "X.107" "X.108" "X.109"
[112] "X.110" "X.111" "X.112"
[115] "X.113" "X.114" "X.115"
[118] "X.116" "X.117" "X.118"
[121] "X.119" "X.120" "X.121"
[124] "X.122" "X.123" "X.124"
[127] "X.125" "X.126" "X.127"
[130] "X.128" "X.129" "X.130"
[133] "X.131" "X.132" "X.133"
[136] "X.134" "X.135" "X.136"
[139] "X.137" "X.138" "X.139"
[142] "X.140" "X.141" "X.142"
[145] "X.143" "X.144" "X.145"
[148] "X.146" "X.147" "X.148"
[151] "X.149" "X.150" "X.151"
[154] "X.152" "X.153" "X.154"
[157] "X.155" "X.156" "X.157"
[160] "X.158" "X.159" "X.160"
[163] "X.161" "X.162" "X.163"
[166] "X.164" "X.165" "X.166"
[169] "X.167" "X.168" "X.169"
[172] "X.170" "X.171" "X.172"
[175] "X.173" "X.174" "X.175"
[178] "X.176" "X.177" "X.178"
[181] "X.179" "X.180" "X.181"
[184] "X.182" "X.183" "X.184"
[187] "X.185" "X.186" "X.187"
[190] "X.188" "X.189" "X.190"
[193] "X.191" "X.192" "X.193"
[196] "X.194" "X.195" "X.196"
[199] "X.197" "X.198" "X.199"
[202] "X.200" "X.201" "X.202"
[205] "X.203" "X.204" "X.205"
[208] "X.206" "X.207" "X.208"
[211] "X.209" "X.210" "X.211"
[214] "X.212" "X.213" "X.214"
[217] "X.215" "X.216" "X.217"
[220] "X.218" "X.219" "X.220"
[223] "X.221" "X.222" "X.223"
[226] "X.224" "X.225" "X.226"
[229] "X.227" "X.228" "X.229"
[232] "X.230" "X.231" "X.232"
[235] "X.233" "X.234" "X.235"
[238] "X.236" "X.237" "X.238"
[241] "X.239" "X.240" "X.241"
[244] "X.242" "X.243" "X.244"
[247] "X.245" "X.246" "X.247"
[250] "X.248" "X.249" "X.250"
[253] "X.251" "X.252" "X.253"
[256] "X.254"
# Preview the first rows of the raw COVID table.
head(COVID)
Data Source 2: CountryData
# Print the CountryData table and report its row count.
CountryData
nrow(CountryData)
[1] 256
# List the variables available in CountryData.
names(CountryData)
[1] "country" "area" "pop" "growth" "birth" "death"
[7] "migr" "maternal" "infant" "life" "fert" "health"
[13] "HIVrate" "HIVpeople" "HIVdeath" "obesity" "underweight" "educ"
[19] "unemploymentYouth" "GDP" "GDPgrowth" "GDPcapita" "saving" "indProd"
[25] "labor" "unemployment" "family" "tax" "budget" "debt"
[31] "inflation" "discount" "lending" "narrow" "broad" "credit"
[37] "shares" "balance" "exports" "imports" "gold" "externalDebt"
[43] "homeStock" "abroadStock" "elecProd" "elecCons" "elecExp" "elecImp"
[49] "elecCap" "elecFossil" "elecNuc" "elecHydro" "elecRenew" "oilProd"
[55] "oilExp" "oilImp" "oilRes" "petroProd" "petroCons" "petroExp"
[61] "petroImp" "gasProd" "gasCons" "gasExp" "gasImp" "gasRes"
[67] "mainlines" "cell" "netHosts" "netUsers" "airports" "railways"
[73] "roadways" "waterways" "marine" "military"
# Preview the first rows of CountryData.
head(CountryData)
COVID
Since our analysis is focused on the spread of COVID-19, we select only columns which pertain to the number of COVID-19 cases in countries over time.
# Tidy the raw COVID table down to country, date, and cases per million.
# The columns carry placeholder names (X, X.1, ...), so the ones we need
# are renamed by hand; the first row is dropped because it apparently
# holds the file's real header rather than data.
TidyCOVID <- COVID %>%
  rename(
    country = total.covid.cases.deaths.per.million,
    Code = X,
    date = X.1,
    casesPerMillion = X.3
  ) %>%
  filter(row_number() > 1) %>%
  select(country, date, casesPerMillion) %>%
  mutate(
    country = as.character(country),
    date = mdy(date),
    # Parse the text of each value. as.integer() on a factor returns the
    # level codes instead of the numbers, and on character input it
    # truncates the fractional per-million values; the old "- 1" offset
    # could not correct either case.
    casesPerMillion = as.numeric(as.character(casesPerMillion)),
    # Blank entries parse to NA. They are treated as zero cases so that
    # later sums stay defined.
    # NOTE(review): confirm blanks really mean "no cases reported yet".
    casesPerMillion = coalesce(casesPerMillion, 0)
  )
TidyCOVID
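What does an instance (row) represent now? After tidying, each row appears to represent one country on one date, together with its COVID-19 cases per million people.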
Select the CountryData variables relevant to our analysis: area (sq km) and pop (number of people). Then calculate a new variable, popdensity: the number of people per sq km.
# Keep the first three CountryData columns (country, area, pop) and derive
# population density as people per unit of area, rounded to two decimals.
RelevantCountryData <- CountryData %>%
  select(country, area, pop) %>%
  mutate(popdensity = round(pop / area, 2))
RelevantCountryData
# Attach area, population, and density to every COVID observation.
# The inner join keeps only countries whose names appear in both tables.
COVIDGrowth <- TidyCOVID %>%
  inner_join(RelevantCountryData, by = "country")
COVIDGrowth
Calculate the number of cases in each country by multiplying casesPerMillion by population (in millions). This variable is now a standardized metric with which we can compare countries.
# Convert cases per million into an estimated case count per country/date.
# The population is used at full precision: rounding it to whole millions
# first would zero out every country with fewer than 500,000 people and
# distort small countries. Only the final count is rounded.
COVIDGrowth <- COVIDGrowth %>%
  mutate(cases = round(casesPerMillion * pop / 1e6)) %>%
  select(country, date, cases, pop, popdensity)
COVIDGrowth
This table records the first date that a country recorded a nonzero number of COVID-19 cases.
# For each country, find the earliest date with a nonzero case count.
# Countries that never report a nonzero count do not appear in the result.
FirstInstance <- COVIDGrowth %>%
  filter(cases != 0) %>%
  group_by(country) %>%
  summarise(beginningofspread = min(date))
FirstInstance
This table computes the average daily increase in cases for each country, from the first day the country recorded COVID-19 cases to the most recent date in the data table (April 5, 2020).
# Average number of new cases per day for each country, measured from its
# first nonzero case count up to the 2020-04-05 snapshot.
DailySpread <- COVIDGrowth %>%
  left_join(FirstInstance, by = "country") %>%
  # Compare against a real Date rather than relying on string coercion.
  filter(date == as.Date("2020-04-05")) %>%
  mutate(
    dayselapsed = as.numeric(date - beginningofspread),
    # A country whose first case falls on the snapshot date has zero
    # elapsed days; count that as one day so the rate is finite, not Inf.
    dailyspread = cases / pmax(dayselapsed, 1),
    # Countries with no recorded cases have no start date, which makes the
    # rate NA; report their spread as 0.
    dailyspread = coalesce(dailyspread, 0)
  ) %>%
  select(country, beginningofspread, dailyspread)
DailySpread
# Add each country's first-case date and average daily spread to every
# country/date observation.
COVIDFinal <- COVIDGrowth %>%
  left_join(DailySpread, by = "country")
COVIDFinal
# Scatter plot of the case count summed over all countries for each date.
COVIDFinal %>%
  group_by(date) %>%
  summarise(totalcases = sum(cases)) %>%
  ggplot(aes(date, totalcases)) +
  geom_point()
# Bar chart of the 20 countries with the highest average daily spread,
# ordered from largest to smallest.
COVIDFinal %>%
  group_by(country) %>%
  summarise(dailyspread = mean(dailyspread)) %>%
  arrange(desc(dailyspread)) %>%
  head(20) %>%
  ggplot(aes(x = reorder(country, -dailyspread), y = dailyspread)) +
  geom_col(width = 0.9) +
  # Print plain numbers on the y axis instead of scientific notation.
  scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
  # Slant the country names so they do not overlap.
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(x = "Country", y = "Spread of COVID-19 cases Per Day")
# Scatter plot of average daily spread against population.
# Every country/date row is drawn, so each country's point is overplotted
# once per date.
ggplot(COVIDFinal, aes(pop, dailyspread)) +
  geom_point()